In this notebook, we explore the application of Independent Component Analysis on the top 28 S&P 500 stock returns from November 2011 to November 2016. We provide the data at https://github.com/Freshdachs/FinData .
import numpy as np
import pandas as pd
import datetime
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.decomposition import FastICA, PCA
from itertools import islice
import functools
# Enable inline plotly rendering inside the notebook (no external server needed).
py.init_notebook_mode(connected=True)
def plot(d, lbls):
    """Render an interactive plotly line chart with one trace per label.

    d    : DataFrame whose index is the x-axis and whose columns hold the series
    lbls : iterable of column labels to plot
    """
    # PEP 8 (E731): a named function is preferred over an assigned lambda.
    return py.iplot([go.Scatter(x=d.index, y=d[i], name=i) for i in lbls])
# --- tuple monad helpers ---
def unit_t(x):
    """Wrap *x* in a 1-tuple unless it is already a list or tuple."""
    # PEP 8 (E731): named functions instead of assigned lambdas.
    return x if isinstance(x, (list, tuple)) else (x,)

def bind_t(f, m):
    """Apply *f* to the unpacked contents of the tuple/list *m*."""
    return f(*m)
def iterate(f, *x, unit=unit_t, bind=bind_t):
    """Infinite generator that repeatedly applies *f* to its own output.

    Each step computes ``f(*state)`` via *bind*, yields the raw result,
    then re-wraps it with *unit* so it can be unpacked on the next step.
    """
    state = x
    while True:
        result = bind(f, state)
        yield result
        state = unit(result)
def nth(n, it):
    """Return the n-th (1-based) element of iterator *it*.

    Raises StopIteration if *it* has fewer than *n* elements.
    BUG FIX: the original ``islice(it, n)[-1]`` always raised TypeError,
    because islice objects are not subscriptable; advance to the n-th
    element lazily instead.
    """
    return next(islice(it, n - 1, n))
#load timeseries
# 'tss3.csv': semicolon-separated, comma as decimal mark, first column parsed
# as a date index. NOTE(review): assumes the file is in the working directory.
df=pd.read_csv('tss3.csv', sep=';',index_col=0,parse_dates=True,decimal=',')
#sort columns by sum
# Reorder columns so the stock with the largest summed price comes first.
df = df[df.columns[np.argsort(df.apply(sum).values)][::-1]]
We use the daily adjusted closing prices of 441 stocks on 2458 days from 2011-11-1 to 2016-11-1 out of the S&P 500. We retrieved the data from Quandl and cleaned it up, so that only stocks and days remain for which we have complete data available.
def pre_diff(df):
    """First-difference each column of *df*.

    Returns ``(diffs, first_row)`` where *diffs* drops the (undefined)
    first difference and *first_row* is the one-row frame needed to
    invert the transform later.
    """
    diffed = df.copy()
    diffed.values[1:] = df.values[1:] - df.values[:-1]
    first_row = df[0:1]
    return (diffed.drop([diffed.index[0]]), first_row)
def pre_mean(df):
    """Subtract each column's mean.

    Returns ``(centered, column_means)`` so the means can be re-added
    during reconstruction.
    """
    centered = df.copy()
    col_means = centered.apply(np.mean)
    demeaned = centered.apply(lambda col: col - np.mean(col))
    return (demeaned, col_means)
def pre_normalize(df):
    """Scale the whole frame by its single largest absolute value.

    Returns ``(scaled, factor)`` so the scale can be undone later.
    """
    scaled = df.copy()
    # Global (not per-column) scale factor: max |value| over the frame.
    factor = np.max(np.max(np.abs(scaled)))
    return (scaled.apply(lambda col: col / factor), factor)
def pre_process(df):
    """Full preprocessing pipeline: difference, de-mean, then normalize.

    Returns ``(processed, base_row, column_means, scale_factor)`` — all
    the pieces `reconstruct` needs to invert the transform.
    """
    diffed, base = pre_diff(df)
    centered, means = pre_mean(diffed)
    normalized, fac = pre_normalize(centered)
    return (normalized, base, means, fac)
# Run the full preprocessing pipeline on the raw price frame.
pre_df, base,means,fac = pre_process(df)
# Column order by per-column maximum (ascending); currently unused below.
idcs = np.argsort(df.apply(max).values)
#lbls = [pre_df.columns[i] for i in idcs[-8:]]
#lbls+['MMM']
# Plot labels: the first eight column names (columns were sorted by sum above).
lbls = df.columns[:8].values
Below we compare the differential, normalized returns with the regular returns.
# Preprocessed (differenced, de-meaned, normalized) returns ...
plot(pre_df,lbls)
# ... versus the raw prices for the same tickers.
plot(df,lbls)
In this section, we apply the ICA approach to our data.
# load ICA and fit our data
# whiten=False: the data is already normalized by the preprocessing step.
ica = FastICA(whiten=False,max_iter=1000)
#reduce datasize
# Keep only the first 28 columns (top stocks by sum) to limit the runtime.
d, base,means,fac = pre_process(df[df.columns[:28]])
# compute sources and invert-transformed signal S from dataframe d
def ica_d(d):
    """Fit the module-level FastICA model on *d*.

    Returns ``(sources, mixing_matrix, reconstruction)`` where the
    reconstruction is the inverse transform of the estimated sources.
    """
    fitted = ica.fit(d)
    sources = fitted.transform(d)
    return (sources, fitted.mixing_, fitted.inverse_transform(sources))
# Unmixed sources S, mixing matrix A, and reconstructed signal x.
S,A,x = ica_d(d)
def reconstruct(df, x, base, means, fac):
    """Invert the preprocessing applied by `pre_process`.

    df    : template frame supplying the index/columns of the diffed data
    x     : array of preprocessed (normalized, de-meaned) differences
    base  : one-row frame with the original first row (from pre_diff)
    means : per-column means removed by pre_mean
    fac   : global scale factor from pre_normalize
    Returns a frame of reconstructed levels, with the base row prepended.
    """
    d = df.copy()
    # Undo normalization and de-meaning.
    x = x * fac + means.values
    # Undo differencing: cumulative sum, offset by the original first row.
    d.values[:] = (np.cumsum(x, 0) + base.values)[:]
    # BUG FIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
    return pd.concat([base, d])
def ranking(S, A):
    """Rank sources per mixing-matrix row by their peak weighted amplitude.

    For each row of *A*, weight every source column of *S* by the row's
    entries and argsort the per-column maxima of the absolute values
    (ascending: weakest source first, strongest last).
    """
    return np.array([np.argsort(np.abs(S * row).max(axis=0)) for row in A])
def indices(R, i=0, n=8):
    """Split ranking row *i* into (strongest-n, weakest-n) source indices."""
    row = R[i]
    return (row[-n:], row[:n])
def sep_ics(S, A, R, i=0, n=8):
    """Reconstruct the signal twice from disjoint source subsets.

    Returns ``[major, minor]``: the signal rebuilt from the *n* strongest
    sources and from the *n* weakest ones (as ranked for row *i* of *R*).
    """
    major, minor = indices(R, i, n)
    return [np.dot(S[:, sel], A[:, sel].T) for sel in (major, minor)]
#S_red = S[:,first_four]
#A_red = A[:,first_four]
#X_red = np.dot(S_red,A_red.T)
def to_df(d, v):
    """Copy frame *d* and overwrite its values with array *v* (labels kept)."""
    framed = d.copy()
    framed.values[:] = v[:]
    return framed
# BUG FIX: R was used on the plot line before being defined (NameError in
# linear execution); compute the ranking first.
R=ranking(S,A)
# Plot the estimated sources.
# NOTE(review): d.columns[indices(R)] indexes the column Index with a tuple of
# two index arrays — presumably meant to label strongest/weakest components;
# verify the intended labels.
plot(to_df(d,S),d.columns[indices(R)].values)
# Split into major/minor component reconstructions and undo the preprocessing.
X_maj, X_min =sep_ics(S,A,R)
rec_df = reconstruct(d,X_maj,base,means,fac)
min_df = reconstruct(d,X_min,base,means,fac)
plot(rec_df,lbls)
plot(min_df,lbls)
plot(df,lbls)
Here we see that both the minor components and the major components can be reconstructed to similar approximations. We see upwards trends in both our components — how can that be? We subtract the mean from our data during the preprocessing. This means that even with a 0 signal from our reconstruction, we still add the mean back during the reconstruction. Since we reconstruct the derivative, this addition translates to a constant slope.
This shows that we can successfully perform an ICA decomposition and back again.
In this section, we analyze what results a PCA approach yields.
# Fit a plain PCA for comparison with the ICA decomposition.
pca = PCA()
# compute sources and invert-transformed signal S from dataframe d
def pca_d(d):
    """Fit the module-level PCA on *d*.

    Returns ``(scores, components transposed as a mixing matrix,
    reconstruction from the scores)`` — same shape of result as `ica_d`.
    """
    fitted = pca.fit(d)
    scores = fitted.transform(d)
    return (scores, fitted.components_.T, fitted.inverse_transform(scores))
# Same pipeline as the ICA section, now with PCA scores/components.
S,A,x = pca_d(d)
R = ranking(S,A)
X_maj, X_min =sep_ics(S,A,R)
rec_df = reconstruct(d,X_maj,base,means,fac)
min_df = reconstruct(d,X_min,base,means,fac)
plot(rec_df,lbls)
plot(min_df,lbls)
plot(df,lbls)
Random collection of code snippets + more.
def topsources(S, n):
    """Return the indices of the *n* source columns of *S* with the largest
    peak absolute amplitude, in ascending order of that peak.

    PEP 8 (E731): a named function is preferred over an assigned lambda.
    """
    peaks = np.apply_along_axis(lambda col: np.max(np.abs(col)), 0, S)
    return np.argsort(peaks)[-n:]
# Indices of the eight strongest sources (ascending by peak |amplitude|).
topsources(S,8)
#show top
# BUG FIX: sources live in the COLUMNS of S (samples x components), so plot
# column S[:, i] for each top-source index, not row S[i].
py.iplot([go.Scatter(y=S[:, i]) for i in topsources(S, 8)])
# Source ordering by signed per-column maximum (no abs), for comparison.
np.argsort( np.apply_along_axis(np.max,0,S))